In [1]:
from scipy.special import logsumexp
from scipy.stats import norm
import sys 
sys.argv[1]=1
from main import *
import os
import pickle
from plot_config import *
import warnings
warnings.filterwarnings("ignore")
In [2]:
# power will change when read into a list: 10** [200, 1000, 5000, 10000, 20000]
mcmc_setup
Out[2]:
{'cluster_maximum': 7, 'power': 10, 'iterations': 4000, 'n_chains': 4}
In [3]:
model_setup
Out[3]:
{'power': 10, 'gem_alpha': 0.5}
In [4]:
data_config
Out[4]:
{'alpha': 500,
 'n_sample': 5000,
 'w0': [0.7, 0.3],
 'mu0': [-2, 2],
 'sigma0': [0.7, 0.8]}
In [5]:
# Grid of power values: 31 points log-spaced between 10**1 and 10**4,
# with exponents rounded to one decimal place.
power_list = np.power(10, np.round(np.linspace(1, 4, 31), 1))

1. Load Fitted Model

*** Need to run Rob_bayes_Experiment.py first to generate .pickle files

In [6]:
def generate_list(path):
    """
    Scan a directory and sort its entries into four lists by filename prefix.

    params: path, directory containing the pickled fits and data files
    return: tuple (fitted_standard, fitted_exact, fitted_approx, data_list)
            of filenames starting with 'fit_standard', 'fit_correct',
            'fit_approx' and 'data' respectively; other files are ignored.
            Each list keeps the order produced by os.listdir.
    """
    buckets = {prefix: [] for prefix in
               ('fit_correct', 'fit_approx', 'fit_standard', 'data')}

    for entry in os.listdir(path):
        # First matching prefix wins; the prefixes are mutually exclusive.
        for prefix in ('fit_correct', 'fit_approx', 'fit_standard', 'data'):
            if entry.startswith(prefix):
                buckets[prefix].append(entry)
                break

    return (buckets['fit_standard'], buckets['fit_correct'],
            buckets['fit_approx'], buckets['data'])

    

def load_fitted_object(pickle_names, path):
    """
    Load a sequence of pickled objects from disk.

    params: pickle_names, list of pickle filenames
    params: path, prefix concatenated directly with each name (include the
            trailing separator if it is a directory)
    return: list of unpickled objects, in the same order as pickle_names

    NOTE: pickle.load executes arbitrary code — only load trusted files.
    """
    loaded = []
    for name in pickle_names:
        full_path = path + name
        print(full_path)  # progress indicator: which pickle is being read
        with open(full_path, "rb") as handle:
            loaded.append(pickle.load(handle))
    return loaded

def estimate_pi_k(fitted_object, threshold = 0.02):
    """
    Estimate P(number of occupied clusters == k) from the posterior draws.

    A component counts as "occupied" in a draw when its mixture weight
    theta exceeds ``threshold``.

    params: fitted_object, dict holding a fitted PyStan object under 'fit'
    params: threshold, minimum weight for a component to count as a cluster
    return: prob_list, where prob_list[k-1] is the fraction of posterior
            draws having exactly k occupied clusters,
            for k = 1 .. mcmc_setup['cluster_maximum']
    """
    # retained draws = iterations * chains / 2 (half discarded as warmup)
    total_samples = mcmc_setup['iterations'] * mcmc_setup['n_chains'] / 2
    # per-draw count of components whose weight clears the threshold
    occupied = np.sum(fitted_object['fit']['theta'] > threshold, axis=1)
    return [np.sum(occupied == k) / total_samples
            for k in range(1, mcmc_setup['cluster_maximum'] + 1)]


def p_xj(i, fitted_object):
    """
    Posterior-sample mixture density evaluated at a single data point.

    For each posterior draw, computes sum_k theta_k * N(i | mu_k, sigma2_k).

    params: i, scalar data value at which to evaluate the density
    params: fitted_object, dict holding a fitted PyStan object under 'fit'
    return: 1-D array (length = number of retained draws) with the mixture
            density per draw

    Bug fix: the previous density used the constant 1/(2*pi) as its
    normalisation; the correct Gaussian normalisation is 1/sqrt(2*pi*sigma2),
    which depends on each component's variance.
    """
    def normal_density(x, mu, sigma2):
        # N(x | mu, sigma2) with sigma2 the VARIANCE (the model's
        # parameterisation — the Stan code uses sqrt(sigma2) as the sd).
        return np.exp(-(x - mu) ** 2 / (2 * sigma2)) / np.sqrt(2 * np.pi * sigma2)

    # retained draws = iterations * chains / 2 (half discarded as warmup)
    n = int(mcmc_setup['iterations'] * mcmc_setup['n_chains'] / 2)

    f_x = normal_density(
        i,
        fitted_object['fit']['mu'].reshape([n, mcmc_setup['cluster_maximum']]),
        fitted_object['fit']['sigma2'])

    weights = fitted_object['fit']['theta']

    # mixture density per draw: weighted sum over components
    return np.sum(f_x * weights, axis=1)




def calc_loglikeli_changed2(fitted_object):
    """
    Posterior-mean log likelihood of the data under the fitted mixture:
    the average over draws of
    log( sum_k theta_k * prod_j N(y_j | mu_k, sqrt(sigma2_k)) ).

    params: fitted_object, dict holding a fitted PyStan object under 'fit'
    return: scalar, mean over posterior draws of the quantity above

    Bug fixes:
      * theta was added on the natural scale inside logsumexp; it must be
        log(theta).
      * norm's third argument is the standard deviation, but the model
        stores the variance sigma2 (the Stan code uses sqrt(sigma2)) —
        take the square root.
      * use norm.logpdf instead of np.log(norm.pdf(...)) to avoid underflow
        for points far from a component mean.
    """
    # retained draws = iterations * chains / 2 (half discarded as warmup)
    n = int(mcmc_setup['iterations'] * mcmc_setup['n_chains'] / 2)
    data = fitted_object['fit'].data['y']
    log_theta = np.log(fitted_object['fit']['theta'])

    mu_matrix = fitted_object['fit']['mu'].reshape(
        [1, n, mcmc_setup['cluster_maximum']])
    sigma2_matrix = fitted_object['fit']['sigma2'].reshape(
        [1, n, mcmc_setup['cluster_maximum']])

    data = np.reshape(data, [data_config['n_sample'], 1, 1])

    # log N(y_j | mu, sd) per (observation, draw, component); scale = sd
    lnorm_pdf = norm.logpdf(data, mu_matrix, np.sqrt(sigma2_matrix))

    # sum log densities over observations -> shape (draw, component)
    lnorm_pdf_total = np.sum(lnorm_pdf, axis=0)

    return np.mean(logsumexp(log_theta + lnorm_pdf_total, axis=1))



def load_estimate_pi_k(path, model, threshold=0.02, neighbor=2):
    """
    Load every fitted model of the given kind, score each fit by its expected
    cluster count and expected log-likelihood, scatter-plot the two, and mark
    the best alpha among the fits closest to the true cluster count.

    params: path, directory holding the .pickle files
    params: model, filename prefix, 'fit_correct' or 'fit_approx'
    params: threshold, minimum weight for a component to count as occupied
    params: neighbor, number of fits nearest the true cluster count to consider
    return: (like_li_list, pi_k_list, power_list)
    raises: ValueError if model is not one of the two known prefixes
    """
    # Escape the literal '.' so the pattern only matches a real '.pickle'
    # suffix; the group captures the numeric alpha/power part of the name.
    regex_alpha = r'' + str(model) + r'(\d.*?)\.pickle'
    pat_alpha = re.compile(regex_alpha, re.S)
    fitted_standard, fitted_exact, fitted_approx, data_list = generate_list(path)
    # NOTE(review): powers are always parsed from the exact-fit names, even
    # when model == 'fit_approx' — confirm both lists share the same suffixes.
    power_list = np.array(pat_alpha.findall(str(fitted_exact)[1:-1]), dtype=float)

    if model == 'fit_correct':
        model_list = [fitted_exact]
    elif model == 'fit_approx':
        model_list = [fitted_approx]
    else:
        # Previously fell through and crashed later with a NameError.
        raise ValueError("model must be 'fit_correct' or 'fit_approx', got %r" % (model,))

    like_li_list = []
    pi_k_list = []
    for model_names in model_list:
        fitted_model = load_fitted_object(model_names, path)

        for model_object in fitted_model:
            like_li_list.append(calc_loglikeli_changed2(model_object))
            # expected cluster count: sum_k k * P(K == k)
            pi_k_list.append(np.dot(
                range(1, mcmc_setup['cluster_maximum'] + 1),
                estimate_pi_k(model_object, threshold=threshold)))

    plt.scatter(pi_k_list, like_li_list)
    plt.title('Calibration of Alpha for ' + str(model))
    plt.ylabel('E[LogLikeli |data]')
    plt.xlabel('E[K_{2%}|data]')
    try:
        # indices of the `neighbor` fits whose expected cluster count is
        # closest to the true number of mixture components
        close_to_true = np.argpartition(
            np.abs(np.array(pi_k_list) - len(data_config['w0'])), neighbor)[:neighbor]
        alpha_index = np.argmax(np.array(like_li_list)[close_to_true])
        best_alpha = power_list[close_to_true][alpha_index]
        likeli_correct = np.array(like_li_list)[close_to_true]
        plt.plot(np.array(pi_k_list)[close_to_true][alpha_index],
                 likeli_correct[alpha_index], 'rx')
        plt.text(np.array(pi_k_list)[close_to_true][alpha_index],
                 likeli_correct[alpha_index],
                 'alpha=' + str(np.round(best_alpha, 2)))
    except (ValueError, IndexError):
        # narrowed from a bare `except:` so unrelated bugs are not swallowed
        print('No alpha could give the correct cluster number')

    return like_li_list, pi_k_list, power_list



def plot_estimate_pi_k(model, path, alpha=None):
    """
    Plot the saved calibration curve (expected cluster count vs expected
    log-likelihood) for a model, marking the left-most point and optionally
    a user-chosen alpha.

    params: model, 'fit_correct' or 'fit_approx' (selects the .npz file)
    params: path, directory containing the saved <model>.npz arrays
    params: alpha, optional power value to highlight in green
    """
    if model == 'fit_correct':
        files = np.load(path + 'fit_correct.npz')
    elif model == 'fit_approx':
        files = np.load(path + 'fit_approx.npz')

    like_li_list = files['arr_0']   # E[log-likelihood | data] per fit
    pi_k_list = files['arr_1']      # E[K | data] per fit
    power_list = files['arr_2']     # alpha / power grid

    left_point_index = np.argmin(pi_k_list)

    left_cluster = pi_k_list[left_point_index]
    left_likeli = like_li_list[left_point_index]

    # split the curve at the left-most point so each branch plots cleanly
    flag1 = (like_li_list <= left_likeli)
    flag2 = (like_li_list >= left_likeli)
    lower_plot = np.array([like_li_list, pi_k_list])[:, flag1]
    upper_plot = np.array([like_li_list, pi_k_list])[:, flag2]
    plt.figure(figsize=(10, 6))
    # sort each branch by its cluster-count row (row 1) before drawing
    plt.plot(lower_plot[1][lower_plot.argsort()[1]],
             lower_plot[0][lower_plot.argsort()[1]], 'b')
    plt.plot(upper_plot[1][upper_plot.argsort()[1]],
             upper_plot[0][upper_plot.argsort()[1]], 'b')

    plt.plot(left_cluster, left_likeli, 'rx')

    plt.text(left_cluster, left_likeli,
             'alpha=' + str(np.round(power_list[left_point_index], 2)))

    plt.title('Calibration of Alpha for ' + str(model))
    plt.ylabel('E[LogLikeli |data]')
    plt.xlabel('E[K_{2%}|data]')
    if alpha is not None:
        # Bug fix: the boolean mask must be applied directly —
        # arr[[mask]] (mask wrapped in a list) was deprecated and then
        # removed in modern NumPy.
        mask = power_list == alpha
        plt.plot(pi_k_list[mask], like_li_list[mask], 'gx')
        plt.text(pi_k_list[mask], like_li_list[mask],
                 'alpha=' + str(np.round(power_list[mask], 2)[0]))
        
In [ ]:
path = "Alpha_result/"
# generate_list returns (standard, exact, approx, data); [-1] is the data list.
data_names = generate_list(path)[-1]
data_lists = load_fitted_object(data_names, path)

# `pat` comes from the star imports above; presumably it extracts the numeric
# n_sample suffix from each data filename — TODO confirm against main.py.
n_sample_list = pat.findall(str(data_names))
n_sample_cycler = cycle(n_sample_list)
    
# Overlay the perturbed and true samples for each dataset.
for data in data_lists:
    sns.distplot(data['pertubation'], label='Pertubated_Data')
    sns.distplot(data['true'], label='True_Data')
    plt.title('n_sample = ' + next(n_sample_cycler))
    plt.legend()
    plt.show()

2. Estimating the Probabilities of Total Clusters

In [ ]:
#a,b,c = load_estimate_pi_k(path, 'fit_correct', threshold = 0.02 ,neighbor= 2)
#np.savez('Alpha_result/fit_correct.npz',a ,b, c)
In [8]:
plot_estimate_pi_k('fit_correct', path)

3. Divergence Analysis

** First, input the name of the fitted pickle that you want to check for divergence

In [7]:
path = 'Alpha_result/'
In [9]:
# Name(s) of the fitted pickle(s) to inspect for divergence.
fitted_names = ['fit_correct10000.pickle']
fitted_object = load_fitted_object(fitted_names, path)[0]
# `pat` (star-imported) pulls the numeric suffix from the fit name so the
# matching data pickle can be loaded — TODO confirm pat's pattern in main.py.
data = load_fitted_object(['data' + pat.findall(str(fitted_names))[0] + '.pickle'], path)[0]
Alpha_result/fit_correct10000.pickle
Alpha_result/data10000.pickle

3.1 $R_{hat}$ table, trace plot, autocorrelation plot and pair plot

In [10]:
az_v_theta_plot(fitted_object['fit'])
Inference for Stan model: anon_model_ae25ae1af289c1ddc87bacb0faa65d2c.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

            mean se_mean     sd    2.5%    25%    50%    75%  97.5%  n_eff   Rhat
v[1]        0.31  7.1e-4   0.01    0.29    0.3   0.31   0.31   0.33    202   1.02
v[2]        0.99  2.1e-3   0.02    0.93   0.99    1.0    1.0    1.0     81   1.04
v[3]        0.72  6.2e-3   0.29    0.07   0.53   0.83   0.97    1.0   2093    1.0
v[4]        0.67  7.5e-3    0.3    0.06   0.44   0.76   0.94    1.0   1580    1.0
v[5]        0.66  8.8e-3    0.3    0.05   0.43   0.74   0.93    1.0   1127   1.01
v[6]        0.67  4.1e-3   0.29    0.05   0.46   0.74   0.93    1.0   5165    1.0
v[7]        0.66    0.01    0.3    0.05   0.41   0.74   0.94    1.0    818   1.01
mu[1,1]     1.97  1.5e-3   0.03     1.9   1.95   1.97   1.99   2.03    412   1.01
mu[2,1]    -1.99  8.9e-4   0.02   -2.02   -2.0  -1.99  -1.98  -1.96    353   1.01
mu[3,1]     0.06    0.06   3.56   -7.56  -1.94   0.07   2.31   7.15   3351    1.0
mu[4,1]    -0.01    0.08   4.41    -8.9  -2.77 3.1e-3   2.72   9.14   3045    1.0
mu[5,1]    -0.03    0.08   4.71   -9.54  -3.15  -0.02   3.08   9.23   3302    1.0
mu[6,1]     0.32    0.17   5.04   -9.42  -3.05   0.28   3.75   9.92    932    1.0
mu[7,1]    -0.12    0.08    4.9   -9.82  -3.39  -0.17   3.16   9.64   3469    1.0
sigma2[1]    0.7  2.0e-3   0.04    0.62   0.68    0.7   0.73   0.78    378   1.01
sigma2[2]   0.51  9.1e-4   0.02    0.48    0.5   0.51   0.52   0.55    368   1.01
sigma2[3]   3.85    0.43  28.05    0.25   0.63   1.15   2.43   17.5   4333    1.0
sigma2[4]   5.74    0.71   36.5    0.27   0.74   1.41   3.23  28.34   2615    1.0
sigma2[5]   7.69    0.97  42.48    0.27   0.74   1.44   3.86  44.93   1904    1.0
sigma2[6]   7.74    1.05  72.34    0.27   0.75   1.51   3.58  36.75   4757    1.0
sigma2[7]   8.23    1.18  89.54    0.27   0.76   1.55   3.83  56.89   5729    1.0
theta[1]    0.31  7.1e-4   0.01    0.29    0.3   0.31   0.31   0.33    202   1.02
theta[2]    0.69  1.5e-3   0.01    0.64   0.68   0.69   0.69    0.7     95   1.04
theta[3]  5.6e-3  1.5e-3   0.01  8.7e-7 1.1e-4 7.1e-4 4.0e-3   0.05     77   1.05
theta[4]  6.5e-4  5.9e-5 1.8e-3  2.3e-8 8.3e-6 7.0e-5 4.4e-4 5.8e-3    948   1.01
theta[5]  1.9e-4  3.3e-5 6.8e-4 9.0e-10 7.0e-7 1.0e-5 8.8e-5 1.9e-3    428   1.01
theta[6]  5.3e-5  7.2e-6 2.5e-4 5.2e-11 7.5e-8 1.6e-6 2.1e-5 4.0e-4   1180    1.0
theta[7]  3.1e-5  5.6e-6 2.0e-4 5.6e-12 1.4e-8 4.2e-7 7.0e-6 2.1e-4   1305    1.0
lp__       -5831    0.12   3.77   -5840  -5834  -5831  -5829  -5825    938   1.01

Samples were drawn using NUTS at Thu Jun 18 11:47:29 2020.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
In [11]:
az_mu_sigma_plot(fitted_object['fit'])
In [12]:
fitted_object['model'].show()
StanModel object 'anon_model_ae25ae1af289c1ddc87bacb0faa65d2c' coded as follows:

data {
 int D; //number of dimensions
 int K; //number of gaussians
 int N; //number of data
 vector[D] y[N]; //data
 real alpha; // components weight prior
 real power;// power posterior
}

parameters {
 real <lower=0,upper=1> v[K]; // betas
 vector[D] mu[K]; //mixture component means
 vector<lower=0>[K] sigma2; //mixture component sigma
}

transformed parameters{
  simplex[K] theta; //mixing proportions
  theta[1] = v[1];
  // stick-break process 
  for(j in 2:(K-1)){
      theta[j]= v[j]*(1-v[j-1])*theta[j-1]/v[j-1]; 
  }
  theta[K]=1-sum(theta[1:(K-1)]); // to make a simplex.
}

model {
real ps[K]; 

 for(k in 1:K){
 mu[k] ~ normal(0,5);
 sigma2[k] ~ inv_gamma(1, 1);
 v[k] ~ beta(1, alpha);   
 }
 
 for (n in 1:N){
 for (k in 1:K){
 ps[k] =log(theta[k])+normal_lpdf(y[n] | mu[k], sqrt(sigma2[k])); //increment log probability of the gaussian
 }
 target += power/(power+N)*log_sum_exp(ps);
 }
}

3.2 Trace plot with P(k=j) > threshold

In [13]:
y_hat = plot( fitted_object ,threshold = 0.02)
plt.title(fitted_names[0] )
Out[13]:
Text(0.5, 1.0, 'fit_correct10000.pickle')

4. Predictive Posterior Check

In [14]:
# Posterior predictive check: overlay the observed data against draws
# sampled from the fitted mixture (y_hat produced by the earlier plot cell).
plt.figure(figsize=(10, 6))
sns.distplot(data['true'], label='True Y')
sns.distplot(y_hat, label ='Y hat')
plt.title('Posterior Prediective')
plt.legend()
Out[14]:
<matplotlib.legend.Legend at 0x2b402abd70f0>
In [ ]:
# sampling code for posterior check
# Sampling code for the posterior predictive check.
# Fix: .extract() was called three times on EVERY loop iteration, rebuilding
# the full posterior arrays each pass (accidental O(n^2) work); the extracts
# are loop-invariant, so hoist them out.
theta_draws = fit['fit'].extract('theta')['theta']
mu_draws = fit['fit'].extract('mu')['mu']
sigma2_draws = fit['fit'].extract('sigma2')['sigma2']

# number of posterior draws — row count of the fit's dataframe, as before
n_draws = np.shape(fit['fit'].to_dataframe())[0]

pred_y = []
for j in range(n_draws):
    # pick one mixture component for this draw according to its weights
    k_temp = np.random.multinomial(1, theta_draws[j, :])
    mu_temp = mu_draws[j, :]
    sigma_temp = sigma2_draws[j, :]

    # the one-hot dot product selects the chosen component's mu and sigma2
    pos_mu = np.dot(mu_temp.T, k_temp)
    pos_sigma = np.dot(sigma_temp.T, k_temp)
    pred_y.append(np.random.normal(pos_mu, np.sqrt(pos_sigma)))